library(haven)
library(tidyverse)
library(mgcv)
library(tidygam)

# read in data
data_16 <- read_xpt("LLCP2016.XPT")

# check distributions
view(colnames(data_16))
table(data_16$`_AGE80`)
table(data_16$SEX)

# Gender identity (TRNSGNDR): 7 = don't know, 9 = refused. These codes are
# treated as missing for gender identity.
table(data_16$TRNSGNDR)

# proportion of missing answers (codes above 5) among respondents with a
# recorded TRNSGNDR value, i.e. in states where the question was asked
mean(data_16$TRNSGNDR > 5, na.rm = TRUE)

# number of missing answers (codes above 5) in states where the question
# was asked
sum(data_16$TRNSGNDR > 5, na.rm = TRUE)

# number of respondents with no recorded TRNSGNDR value at all
sum(is.na(data_16$TRNSGNDR))

# Keep respondents with a substantive gender-identity answer. filter() keeps
# only rows where the condition is TRUE, so this drops codes 7 (don't know)
# and 9 (refused) as well as rows where TRNSGNDR is NA.
df.16 <- data_16 %>%
  filter(TRNSGNDR < 5)

df.16$age <- df.16$`_AGE80`

# share and count of respondents at the top-coded age value (80)
mean(df.16$age == 80, na.rm = TRUE)
sum(df.16$age == 80, na.rm = TRUE)

df.16 <- df.16 %>%
  filter(age < 80) %>% # drop top-coded ages (treated as missing)
  # 0/1 indicators derived from TRNSGNDR: codes 1-3 count as trans,
  # code 4 does not
  mutate(trans = if_else(TRNSGNDR < 4, 1, 0),
         transfem = if_else(TRNSGNDR == 1, 1, 0),
         transmasc = if_else(TRNSGNDR == 2, 1, 0),
         gnc = if_else(TRNSGNDR == 3, 1, 0),
         # text label; for TRNSGNDR == 4 the label depends on SEX, and rows
         # matching no branch (SEX neither 1 nor 2) get NA
         gender_id = case_when(TRNSGNDR == 1 ~ "Transfeminine",
                               TRNSGNDR == 2 ~ "Transmasculine",
                               TRNSGNDR == 3 ~ "Gender nonconforming",
                               TRNSGNDR == 4 & SEX == 1 ~ "Cis man",
                               TRNSGNDR == 4 & SEX == 2 ~ "Cis woman"))

# Age distributions (2-year bins)

# all participants
ggplot(df.16, aes(x = age)) +
  geom_histogram(binwidth = 2, colour = "deepskyblue4", fill = "deepskyblue3")

# trans participants only
df.16 %>%
  filter(trans == 1) %>%
  ggplot(aes(x = age)) +
  geom_histogram(binwidth = 2, colour = "deepskyblue4", fill = "deepskyblue3")

# Clean covariates: copy the race/ethnicity codes under plain names, then
# build text labels for ethnicity, race, and education. Codes that match no
# branch of a case_when() become NA.
df.16 <- df.16 %>%
  mutate(
    race2 = `_MRACE1`,
    ethnicity2 = `_HISPANC`,
    ethnicity = case_when(
      ethnicity2 == 1 ~ "Hispanic",
      ethnicity2 == 2 ~ "Non-Hispanic"
    ),
    race = case_when(
      race2 == 1 ~ "White",
      race2 == 2 ~ "Black",
      race2 == 4 ~ "Asian",
      race2 == 7 ~ "Multiracial",
      race2 %in% c(3, 5, 6) ~ "Other"
    ),
    # branches are evaluated in order, so each one only catches the codes
    # that the previous branch did not
    edu = case_when(
      EDUCA < 4 ~ "Less than HS",
      EDUCA < 5 ~ "HS",
      EDUCA < 6 ~ "Some college",
      EDUCA == 6 ~ "College degree"
    )
  )

# Chronic-condition indicators (1 = yes, 0 = no). Codes that match no branch
# of a case_when() become NA.
df.16 <- df.16 %>%
  mutate(diabetes = case_when(DIABETE3 == 1 ~ 1,
                              DIABETE3 %in% 2:4 ~ 0),
         # Diagnosed before age 25. The "not early" branch must start at 25:
         # with a lower bound of 26, a diagnosis at exactly age 25 matches
         # neither branch and silently becomes NA.
         # NOTE(review): values above 80 stay NA; presumably these are
         # non-response codes -- confirm against the codebook.
         diabetes_early = case_when(DIABAGE2 < 25 ~ 1,
                                    DIABAGE2 %in% 25:80 ~ 0),
         depression = case_when(ADDEPEV2 == 1 ~ 1,
                                ADDEPEV2 == 2 ~ 0),
         arthritis = case_when(HAVARTH3 == 1 ~ 1,
                               HAVARTH3 == 2 ~ 0))

# Clean health outcomes and copy the weight, PSU, stratum, state, and race
# columns under plain names
df.16 <- df.16 %>%
  mutate(
    # GENHLTH below 4 -> 0, 4 up to (not including) 6 -> 1, anything else NA
    fair_poor = case_when(
      GENHLTH < 4 ~ 0,
      GENHLTH < 6 ~ 1
    ),
    # labelled factor version of the 0/1 trans indicator
    trans_fac = factor(trans, levels = c(0, 1), labels = c("Cis", "Trans")),
    weights = `_LLCPWT`,
    id = `_PSU`,
    strata = `_STSTR`,
    state_id = `_STATE`,
    race1 = `_RACE1`
  )

# Disability items recoded to 1 = yes (source code 1) and 0 = no (source
# code 2); any other source code becomes NA.
df.16 <- df.16 %>%
  mutate(blind = case_when(BLIND == 2 ~ 0,
                           BLIND == 1 ~ 1),
         deaf = case_when(DEAF == 2 ~ 0,
                          DEAF == 1 ~ 1),
         decide = case_when(DECIDE == 2 ~ 0,
                            DECIDE == 1 ~ 1),
         walk = case_when(DIFFWALK == 2 ~ 0,
                          DIFFWALK == 1 ~ 1),
         dress = case_when(DIFFDRES == 2 ~ 0,
                           DIFFDRES == 1 ~ 1),
         alone = case_when(DIFFALON == 2 ~ 0,
                           DIFFALON == 1 ~ 1),
         # Any disability: 1 if at least one item is a yes, 0 only if all six
         # items are a no, NA otherwise. Summing the items would turn a
         # respondent with one yes and one missing item into NA; `|` and `&`
         # use three-valued logic, so a single yes is sufficient and a single
         # NA only matters when no item is a yes.
         disabled = case_when(
           blind == 1 | deaf == 1 | decide == 1 |
             walk == 1 | dress == 1 | alone == 1 ~ 1,
           blind == 0 & deaf == 0 & decide == 0 &
             walk == 0 & dress == 0 & alone == 0 ~ 0
         ))

# view(colnames(df.16))

# Keep SEX, HTIN4, and every column from `age` through `disabled` (by column
# position), tag the survey year, and write the result to disk.
clean_16 <- df.16 %>%
  select(SEX, HTIN4, age:disabled) %>%
  mutate(year = 2016)

write_csv(clean_16, "Clean data 2016.csv")
